library(tidyverse)
library(tm)
library(tidytext)
library(wordcloud2)
library(ggplot2)
# Load the philosophy corpus. The code below relies on the columns
# `sentence_length`, `school` and `sentence_str`.
data <- read_csv("../data/philosophy_data.csv")

# Distribution of sentence length over the whole corpus.
hist(
  data$sentence_length,
  breaks = 200,
  xlab = "Length",
  main = "Histogram of length"
)

# Same histogram with the long right tail (length >= 600) cut off so the
# bulk of the distribution is readable. which() also discards NA lengths.
hist(
  data$sentence_length[which(data$sentence_length < 600)],
  breaks = 200,
  xlab = "Length",
  main = "Histogram of length (<600)"
)

# Share of sentences longer than 600; the mean of a logical vector is the
# proportion of TRUE values.
mean(data$sentence_length > 600)
#> [1] 0.004063103   (output recorded from an earlier run)

# Typical sentence length.
median(data$sentence_length)
#> [1] 127           (output recorded from an earlier run)

# Sentence-length distribution per school; the outer parentheses print the
# plot in addition to storing it in `g1`.
(g1 <- ggplot(data, aes(school, sentence_length, fill = school)) +
  geom_violin())

# Stop-word table used by every tokenisation step. Despite the `rus_` prefix
# this is the English list from tm::stopwords("en"), extended with a few
# high-frequency filler words. The name is kept because other code refers
# to it.
rus_stopwords <- data.frame(
  word = c(
    stopwords("en"),
    "one", "will", "can", "must", "things", "also", "may",
    "thing", "just", "us", "yet", "else", "man"
  )
)

# Split the `sentence_str` column of `sentences` into one word per row and
# drop every word listed in `stopword_table` (a data frame with a `word`
# column). The join key is explicit so no "Joining, by = ..." message is
# emitted and extra columns in the stop-word table cannot change the join.
sentence_tokens <- function(sentences, stopword_table = rus_stopwords) {
  sentences %>%
    select(sentence_str) %>%
    unnest_tokens(word, sentence_str) %>%
    anti_join(stopword_table, by = "word")
}

# Tokens for the sentences of a single school. `school_name` is compared
# against the `school` column of `corpus`; `.env$` makes sure the function
# argument is used even if `corpus` has a column of the same name.
school_tokens <- function(school_name, corpus = data) {
  corpus %>%
    filter(school == .env$school_name) %>%
    sentence_tokens()
}

# Word counts for a token table, most frequent word first (columns `word`
# and `n`).
word_frequency <- function(tokens) {
  tokens %>%
    count(word) %>%
    arrange(desc(n))
}

# Whole corpus ----
all_tokens <- sentence_tokens(data)
all_frequency <- word_frequency(all_tokens)
# wordcloud2(all_frequency)

# Per-school token and frequency tables ----
plato_tokens <- school_tokens("plato")
plato_frequency <- word_frequency(plato_tokens)
# wordcloud2(plato_frequency)

aristotle_tokens <- school_tokens("aristotle")
aristotle_frequency <- word_frequency(aristotle_tokens)
# wordcloud2(aristotle_frequency)

empiricism_tokens <- school_tokens("empiricism")
empiricism_frequency <- word_frequency(empiricism_tokens)
# wordcloud2(empiricism_frequency)

rationalism_tokens <- school_tokens("rationalism")
rationalism_frequency <- word_frequency(rationalism_tokens)
# wordcloud2(rationalism_frequency)

analytic_tokens <- school_tokens("analytic")
analytic_frequency <- word_frequency(analytic_tokens)
# wordcloud2(analytic_frequency)

continental_tokens <- school_tokens("continental")
continental_frequency <- word_frequency(continental_tokens)
# wordcloud2(continental_frequency)

phenomenology_tokens <- school_tokens("phenomenology")
phenomenology_frequency <- word_frequency(phenomenology_tokens)
# wordcloud2(phenomenology_frequency)

german_idealism_tokens <- school_tokens("german_idealism")
german_idealism_frequency <- word_frequency(german_idealism_tokens)
# wordcloud2(german_idealism_frequency)

communism_tokens <- school_tokens("communism")
communism_frequency <- word_frequency(communism_tokens)
# wordcloud2(communism_frequency)

capitalism_tokens <- school_tokens("capitalism")
capitalism_frequency <- word_frequency(capitalism_tokens)
# wordcloud2(capitalism_frequency)

stoicism_tokens <- school_tokens("stoicism")
stoicism_frequency <- word_frequency(stoicism_tokens)
# wordcloud2(stoicism_frequency)

nietzsche_tokens <- school_tokens("nietzsche")
nietzsche_frequency <- word_frequency(nietzsche_tokens)
# wordcloud2(nietzsche_frequency)

# Word tokens of the feminism texts with stop words removed. The join key is
# given explicitly, so the "Joining, by = ..." console message that had been
# pasted into the script is no longer produced.
feminism_tokens <- data %>%
  filter(school == "feminism") %>%
  select(sentence_str) %>%
  unnest_tokens(word, sentence_str) %>%
  anti_join(rus_stopwords, by = "word")

# Word counts for the feminism texts, most frequent first.
feminism_frequency <- feminism_tokens %>%
  count(word) %>%
  arrange(desc(n))
# wordcloud2(feminism_frequency)

# Compare how often "women"/"woman" occur in the feminism texts versus the
# whole corpus. The two tables are matched on `word` rather than by row
# position: each table is sorted by its own counts, so the two words are not
# guaranteed to appear in the same order in both.
women_words <- c("women", "woman")
feminism_frequency %>%
  filter(word %in% women_words) %>%
  select(word, feminism = n) %>%
  left_join(
    all_frequency %>%
      filter(word %in% women_words) %>%
      select(word, all = n),
    by = "word"
  )

# Number of sentences inside vs. outside the feminism school, to put the
# word counts above into proportion.
count(data, school == "feminism")

LS0tDQp0aXRsZTogIlByb2plY3QgMSBSIE5vdGVib29rIERhdGEgU3RvcnkiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cixyZXN1bHRzPSJoaWRlIn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeSh0bSkNCmxpYnJhcnkodGlkeXRleHQpDQpsaWJyYXJ5KHdvcmRjbG91ZDIpDQpsaWJyYXJ5KGdncGxvdDIpDQpkYXRhPXJlYWRfY3N2KCIuLi9kYXRhL3BoaWxvc29waHlfZGF0YS5jc3YiKQ0KDQpgYGANCg0KDQoNCmBgYHtyfQ0KaGlzdChkYXRhJHNlbnRlbmNlX2xlbmd0aCxicmVha3MgPSAyMDAseGxhYiA9ICJMZW5ndGgiLG1haW4gPSAiSGlzdG9ncmFtIG9mIGxlbmd0aCIpDQpoaXN0KGRhdGEkc2VudGVuY2VfbGVuZ3RoW3doaWNoKGRhdGEkc2VudGVuY2VfbGVuZ3RoPDYwMCldLGJyZWFrcyA9IDIwMCx4bGFiID0gIkxlbmd0aCIsbWFpbiA9ICJIaXN0b2dyYW0gb2YgbGVuZ3RoICg8NjAwKSIpDQpzdW0oZGF0YSRzZW50ZW5jZV9sZW5ndGg+NjAwKS9ucm93KGRhdGEpDQptZWRpYW4oZGF0YSRzZW50ZW5jZV9sZW5ndGgpDQpgYGANCg0KDQoNCmBgYHtyfQ0KKGcxPSBnZ3Bsb3QoZGF0YSxhZXMoc2Nob29sLHNlbnRlbmNlX2xlbmd0aCxmaWxsPXNjaG9vbCkpKw0KICBnZW9tX3Zpb2xpbigpKQ0KYGBgDQoNCg0KYGBge3J9DQpydXNfc3RvcHdvcmRzID0gZGF0YS5mcmFtZSh3b3JkID0gc3RvcHdvcmRzKCJlbiIpKQ0KcnVzX3N0b3B3b3Jkcz1yYmluZChydXNfc3RvcHdvcmRzLCJvbmUiLCJ3aWxsIiwiY2FuIiwibXVzdCIsInRoaW5ncyIsImFsc28iLCJtYXkiLCJ0aGluZyIsImp1c3QiLCJ1cyIsInlldCIsImVsc2UiLCJtYW4iKQ0KDQphbGxfdG9rZW5zPWRhdGElPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQphbGxfZnJlcXVlbmN5ID0gYWxsX3Rva2VucyAlPiUgY291bnQod29yZCkgJT4lIGFycmFuZ2UoZGVzYyhuKSkNCiMjIyB3b3JkY2xvdWQyKGFsbF9mcmVxdWVuY3kpDQoNCmBgYA0KDQohW10oLi4vZmlncy9hbGwucG5nKQ0KDQpgYGB7cn0NCnBsYXRvX3Rva2Vucz1kYXRhJT4lZmlsdGVyKHNjaG9vbD09InBsYXRvIiklPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQpwbGF0b19mcmVxdWVuY3kgPSBwbGF0b190b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihwbGF0b19mcmVxdWVuY3kpDQpgYGAgICANCg0KIVtdKC4uL2ZpZ3MvcGxhdG8ucG5nKQ0KDQoNCmBgYHtyfQ0KYXJpc3RvdGxlX3Rva2Vucz1kYXRhJT4lZmlsdGVyKHNjaG9vbD09ImFyaXN0b3RsZSIpJT4lc2VsZWN0KHNlbnRlbmNlX3N0ciklPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBzZW50ZW5jZV9zdHIpJT4lIGFudGlfam9pbihydXNfc3RvcHdv
cmRzKQ0KYXJpc3RvdGxlX2ZyZXF1ZW5jeSA9IGFyaXN0b3RsZV90b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihhcmlzdG90bGVfZnJlcXVlbmN5KQ0KYGBgDQoNCiFbXSguLi9maWdzL2FyaXN0b3RsZS5wbmcpDQoNCmBgYHtyfQ0KZW1waXJpY2lzbV90b2tlbnM9ZGF0YSU+JWZpbHRlcihzY2hvb2w9PSJlbXBpcmljaXNtIiklPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQplbXBpcmljaXNtX2ZyZXF1ZW5jeSA9IGVtcGlyaWNpc21fdG9rZW5zICU+JSBjb3VudCh3b3JkKSAlPiUgYXJyYW5nZShkZXNjKG4pKQ0KIyMjIHdvcmRjbG91ZDIoZW1waXJpY2lzbV9mcmVxdWVuY3kpDQpgYGANCg0KIVtdKC4uL2ZpZ3MvZW1waXJpY2lzbS5wbmcpDQoNCmBgYHtyfQ0KcmF0aW9uYWxpc21fdG9rZW5zPWRhdGElPiVmaWx0ZXIoc2Nob29sPT0icmF0aW9uYWxpc20iKSU+JXNlbGVjdChzZW50ZW5jZV9zdHIpJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgc2VudGVuY2Vfc3RyKSU+JSBhbnRpX2pvaW4ocnVzX3N0b3B3b3JkcykNCnJhdGlvbmFsaXNtX2ZyZXF1ZW5jeSA9IHJhdGlvbmFsaXNtX3Rva2VucyAlPiUgY291bnQod29yZCkgJT4lIGFycmFuZ2UoZGVzYyhuKSkNCiMjIyB3b3JkY2xvdWQyKHJhdGlvbmFsaXNtX2ZyZXF1ZW5jeSkNCmBgYA0KDQohW10oLi4vZmlncy9yYXRpb25hbGlzbS5wbmcpDQoNCmBgYHtyfQ0KYW5hbHl0aWNfdG9rZW5zPWRhdGElPiVmaWx0ZXIoc2Nob29sPT0iYW5hbHl0aWMiKSU+JXNlbGVjdChzZW50ZW5jZV9zdHIpJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgc2VudGVuY2Vfc3RyKSU+JSBhbnRpX2pvaW4ocnVzX3N0b3B3b3JkcykNCmFuYWx5dGljX2ZyZXF1ZW5jeSA9IGFuYWx5dGljX3Rva2VucyAlPiUgY291bnQod29yZCkgJT4lIGFycmFuZ2UoZGVzYyhuKSkNCiMjIyB3b3JkY2xvdWQyKGFuYWx5dGljX2ZyZXF1ZW5jeSkNCmBgYA0KDQohW10oLi4vZmlncy9hbmFseXRpYy5wbmcpDQoNCmBgYHtyfQ0KY29udGluZW50YWxfdG9rZW5zPWRhdGElPiVmaWx0ZXIoc2Nob29sPT0iY29udGluZW50YWwiKSU+JXNlbGVjdChzZW50ZW5jZV9zdHIpJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgc2VudGVuY2Vfc3RyKSU+JSBhbnRpX2pvaW4ocnVzX3N0b3B3b3JkcykNCmNvbnRpbmVudGFsX2ZyZXF1ZW5jeSA9IGNvbnRpbmVudGFsX3Rva2VucyAlPiUgY291bnQod29yZCkgJT4lIGFycmFuZ2UoZGVzYyhuKSkNCiMjIyB3b3JkY2xvdWQyKGNvbnRpbmVudGFsX2ZyZXF1ZW5jeSkNCmBgYA0KDQohW10oLi4vZmlncy9jb250aW5lbnRhbC5wbmcpDQoNCmBgYHtyfQ0KcGhlbm9tZW5vbG9neV90b2tlbnM9ZGF0YSU+JWZpbHRlcihzY2hvb2w9PSJwaGVub21lbm9sb2d5IiklPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9r
ZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQpwaGVub21lbm9sb2d5X2ZyZXF1ZW5jeSA9IHBoZW5vbWVub2xvZ3lfdG9rZW5zICU+JSBjb3VudCh3b3JkKSAlPiUgYXJyYW5nZShkZXNjKG4pKQ0KIyMjIHdvcmRjbG91ZDIocGhlbm9tZW5vbG9neV9mcmVxdWVuY3kpDQpgYGANCg0KIVtdKC4uL2ZpZ3MvcGhlbm9tZW5vbG9neS5wbmcpDQoNCmBgYHtyfQ0KZ2VybWFuX2lkZWFsaXNtX3Rva2Vucz1kYXRhJT4lZmlsdGVyKHNjaG9vbD09Imdlcm1hbl9pZGVhbGlzbSIpJT4lc2VsZWN0KHNlbnRlbmNlX3N0ciklPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBzZW50ZW5jZV9zdHIpJT4lIGFudGlfam9pbihydXNfc3RvcHdvcmRzKQ0KZ2VybWFuX2lkZWFsaXNtX2ZyZXF1ZW5jeSA9IGdlcm1hbl9pZGVhbGlzbV90b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihnZXJtYW5faWRlYWxpc21fZnJlcXVlbmN5KQ0KYGBgDQoNCiFbXSguLi9maWdzL2dlcm1hbl9pZGVhbGlzbS5wbmcpDQoNCmBgYHtyfQ0KY29tbXVuaXNtX3Rva2Vucz1kYXRhJT4lZmlsdGVyKHNjaG9vbD09ImNvbW11bmlzbSIpJT4lc2VsZWN0KHNlbnRlbmNlX3N0ciklPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBzZW50ZW5jZV9zdHIpJT4lIGFudGlfam9pbihydXNfc3RvcHdvcmRzKQ0KY29tbXVuaXNtX2ZyZXF1ZW5jeSA9IGNvbW11bmlzbV90b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihjb21tdW5pc21fZnJlcXVlbmN5KQ0KYGBgDQoNCiFbXSguLi9maWdzL2NvbW11bmlzbS5wbmcpDQoNCmBgYHtyfQ0KY2FwaXRhbGlzbV90b2tlbnM9ZGF0YSU+JWZpbHRlcihzY2hvb2w9PSJjYXBpdGFsaXNtIiklPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQpjYXBpdGFsaXNtX2ZyZXF1ZW5jeSA9IGNhcGl0YWxpc21fdG9rZW5zICU+JSBjb3VudCh3b3JkKSAlPiUgYXJyYW5nZShkZXNjKG4pKQ0KIyMjIHdvcmRjbG91ZDIoY2FwaXRhbGlzbV9mcmVxdWVuY3kpDQpgYGANCg0KIVtdKC4uL2ZpZ3MvY2FwaXRhbGlzbS5wbmcpDQoNCmBgYHtyfQ0Kc3RvaWNpc21fdG9rZW5zPWRhdGElPiVmaWx0ZXIoc2Nob29sPT0ic3RvaWNpc20iKSU+JXNlbGVjdChzZW50ZW5jZV9zdHIpJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgc2VudGVuY2Vfc3RyKSU+JSBhbnRpX2pvaW4ocnVzX3N0b3B3b3JkcykNCnN0b2ljaXNtX2ZyZXF1ZW5jeSA9IHN0b2ljaXNtX3Rva2VucyAlPiUgY291bnQod29yZCkgJT4lIGFycmFuZ2UoZGVzYyhuKSkNCiMjIyB3b3JkY2xvdWQyKHN0b2ljaXNtX2ZyZXF1ZW5jeSkNCmBgYA0KDQohW10oLi4vZmlncy9zdG9pY2lzbS5wbmcpDQoNCmBgYHtyfQ0KbmlldHpzY2hlX3Rva2Vucz1kYXRhJT4l
ZmlsdGVyKHNjaG9vbD09Im5pZXR6c2NoZSIpJT4lc2VsZWN0KHNlbnRlbmNlX3N0ciklPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBzZW50ZW5jZV9zdHIpJT4lIGFudGlfam9pbihydXNfc3RvcHdvcmRzKQ0KbmlldHpzY2hlX2ZyZXF1ZW5jeSA9IG5pZXR6c2NoZV90b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihuaWV0enNjaGVfZnJlcXVlbmN5KQ0KYGBgDQoNCiFbXSguLi9maWdzL25pZXR6c2NoZS5wbmcpDQogDQogDQpgYGB7cn0NCmZlbWluaXNtX3Rva2Vucz1kYXRhJT4lZmlsdGVyKHNjaG9vbD09ImZlbWluaXNtIiklPiVzZWxlY3Qoc2VudGVuY2Vfc3RyKSU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHNlbnRlbmNlX3N0ciklPiUgYW50aV9qb2luKHJ1c19zdG9wd29yZHMpDQpmZW1pbmlzbV9mcmVxdWVuY3kgPSBmZW1pbmlzbV90b2tlbnMgJT4lIGNvdW50KHdvcmQpICU+JSBhcnJhbmdlKGRlc2MobikpDQojIyMgd29yZGNsb3VkMihmZW1pbmlzbV9mcmVxdWVuY3kpDQpmZW1pbmlzbV9mcmVxdWVuY3klPiVmaWx0ZXIod29yZD09IndvbWVuInx3b3JkPT0id29tYW4iKSU+JXNlbGVjdCh3b3JkLGZlbWluaXNtPW4pJT4lbXV0YXRlKA0KYWxsX2ZyZXF1ZW5jeSU+JWZpbHRlcih3b3JkPT0id29tZW4ifHdvcmQ9PSJ3b21hbiIpJT4lc2VsZWN0KGFsbD1uKSkNCmNvdW50KGRhdGEsc2Nob29sPT0iZmVtaW5pc20iKQ0KYGBgDQoNCiFbXSguLi9maWdzL2ZlbWluaXNtLnBuZykNCg0KDQoNCg0K